---
title: "Cleaning NELDA"
---

# Load

```{r}
# attach the packages used throughout this document (listed in the shared helper script)
source("helper-packages.R")

# read the raw NELDA Stata file from the raw-data folder
# NOTE(review): `import()` is presumably rio::import, made available by the helper script -- confirm
nelda_raw <- import("../raw-data/x-nelda/id & q-wide_share.dta")
```

* Check out Serbia and Yugoslavia.
* Check out Russia/USSR and what happens with the last election there.

# Clean data

```{r}
# clean: build one row per country / election date from the raw NELDA file, keeping
# a standardized country name, the election date, the election type(s) and three
# 0/1 descriptive indicators (nelda6, nelda12, nelda33)
  nelda_clean <- 
    nelda_raw %>%

  # remove elections where month and day cannot be determined [9 cases in Monaco, Maldives, Marshall Islands, Tonga]
  # (rows with a missing mmdd are dropped here as well, because the comparison evaluates to NA)
    filter(mmdd != 0) %>%

  # remove an erroneous legislative/parliamentary election in Saudi Arabia in 2020, for which I can find no references
  # NOTE(review): this drops EVERY row coded as Saudi Arabia, not only the 2020 one -- confirm that is the only entry
  # remove the historical Republic of Vietnam (the Democratic Republic of Vietnam is renamed to Vietnam below)
  # remove the two elections from South Yemen (which existed until 1990)
    filter(!country %in% c("Saudi Arabia", "Republic of Vietnam", "South Yemen")) %>%

  # generate standardized country names [unmatched countries are listed below]
    mutate(
      nelda_country_name = countryname(countrycode(ccode, origin = "gwn", destination = "country.name")),

  # manual overrides keyed on NELDA's own country label; every other row keeps the code-based match
      nelda_country_name = 
        case_when(
          country == "Sao Tome and Principe" ~ countryname("Sao Tome and Principe"),
          country == "Democratic Republic of Vietnam" ~ countryname("Vietnam"),
          country == "Andorra" ~ countryname("Andorra"),
          TRUE ~ nelda_country_name),

  # preserve nelda election ids for reference
  # NOTE(review): this column does not survive the summarise() below, so it is not in the saved output -- confirm whether it should be collapsed and kept
      nelda_election_id = electionid,

  # variable for election type: constituent assembly/executive/legislative or parliamentary
      nelda_election_type = types,

  # generate election date variable [4 unconcatenated dates from Andorra and Sudan, 1977 and previously; inconsequential]
  # mmdd is left-padded to four characters; any other width yields NA and therefore an NA date
      temp_mmdd = 
        case_when(
          nchar(mmdd) == 4 ~ as.character(mmdd),
          nchar(mmdd) == 3 ~ paste0("0", mmdd)),
      nelda_election_date = as.Date(paste0(year, temp_mmdd), format = "%Y%m%d"),

  # descriptive variable: were elections held on schedule according to regular procedure ("nelda6 If regular, were these elections early or late relative to the date they were supposed to be held per established procedure?")
      nelda_6_on_schedule_election =
        case_when(
          nelda6 == "no" ~ 1, # NB: "“No” means that elections took place according to their scheduled date." (from codebook)
          nelda6 == "yes" ~ 0,
          TRUE ~ NA_real_ # = "N/A" and "unclear"
        ),

  # descriptive variable: Was there significant violence involving civilian deaths immediately before, during, or after the election?
      nelda_33_any_violence_civilian_deaths =
        case_when(
          nelda33 == "yes" ~ 1, # "yes" means that there were civilian deaths (from codebook)
          nelda33 == "no" ~ 0, # "no" means that there were not civilian deaths (from codebook); fixed: this branch previously tested nelda6
          TRUE ~ NA_real_ # = "N/A" and "unclear"
        ),

  # descriptive variable: were elections competitive ("nelda12 Was the incumbent or ruling party confident of victory before elections?")
      nelda_12_competitive_election = 
        case_when(
          nelda12 == "no" ~ 1, # no means that the incumbent was NOT confident of victory in the election, meaning that it was competitive
          nelda12 == "yes" ~ 0,
          TRUE ~ NA_real_ # = "N/A" and "unclear"
        )) %>% 

  # retain only cleaned variables (this also drops the temp_mmdd helper column)
    select(country, starts_with("nelda_")) %>% 

  # drop cases where country name or date is missing
    filter(!is.na(nelda_country_name) & !is.na(nelda_election_date)) %>%

  # important: multiple elections can occur on the same day; i keep only one row per country/date;
  # for the three descriptive variables, i retain the MAXIMUM value (i.e. was any of the elections
  # competitive / on schedule / accompanied by deadly violence?); election types are concatenated
    group_by(nelda_country_name, nelda_election_date) %>% 
      summarise(
  # NA-safe maximum: a group whose values are all NA stays NA, instead of producing -Inf plus a warning from max()
        across(
          c(nelda_12_competitive_election,
            nelda_6_on_schedule_election,
            nelda_33_any_violence_civilian_deaths),
          ~ if (all(is.na(.x))) NA_real_ else max(.x, na.rm = TRUE)),
        nelda_election_type = paste0(nelda_election_type, collapse = "; "),
  # return an ungrouped result directly (no residual grouping, no summarise() message)
        .groups = "drop")
```

# Save data

```{r}
  # write the cleaned election-level data to the cleaned-data folder as an .rds file
  saveRDS(
    object = nelda_clean,
    file = "../cleaned-data/x-1-nelda.rds"
  )
```


Countries in NELDA that are not matched by the `countrycode()`/`countryname()` lookup:

```
 [1] "Dominica"                         "Grenada"                          "Saint Lucia"                     
 [4] "Saint Vincent and the Grenadines" "Antigua & Barbuda"                "Saint Kitts and Nevis"           
 [7] "Monaco"                           "Liechtenstein"                    "Andorra"                         
[10] "San Marino"                       "Abkhazia"                         "South Ossetia"                   
[13] "Seychelles"                       "Democratic Republic of Vietnam"   "Vanuatu"                         
[16] "Kiribati"                         "Nauru"                            "Tonga"                           
[19] "Tuvalu"                           "Marshall Islands"                 "Palau"                           
[22] "Federated States of Micronesia"   "Samoa/Western Samoa" 
```

Note: all of these are either microstates or historical entities that do not appear in the compiled attitudinal data.












